start.time <- Sys.time()
library(tidyverse)
library(huxtable)
library(ggplot2)
library(lubridate)
library(egg)
library(gridExtra)
library(ggpattern)
library(spsurvey) #This code was written for spsurvey v5.0.0
To run cat_analysis(), the dataframe will need to include columns for the categorical variable you are interested in, weight for each site, xcoordinate and ycoordinate. Optionally, you can also include a subpopulation or stratum column to separate results and site IDs for two-stage analysis (for example, if there are repeated sites).
In this script, weight is adjusted because of the proportional design. In the future, this can be excluded as the design should include weights already.
I am designating relevant variables from the dataset as well as creating new variables: mean dissolved oxygen from 0-2m, epilimentic means for ph and spconductance, total nitrogen and DIN:TP ratio.
# retrieve raw data from database
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Current")
source("new_database/Reading.LMAS.Data.R")
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")
rm(list=setdiff(ls(), c('newdata',"start.time")))
#list relevant variables
lab<-newdata %>%
filter(SAMPLE_DATE>'2020-01-01') %>%
mutate(combined=paste(CHARACTERISTIC_NAME,
INFORMATION_TYPE,
RSLT_RESULT_SAMPLE_FRACTION,
sep = "_")) %>%
select(LAKE_HISTORY_ID,
SAMPLE_DATE,
combined,
RSLT_RESULT_VALUE,
RSLT_LABORATORY_QUALIFIER,
RSLT_VALIDATOR_QUALIFIER,
RSLT_PROFILE_DEPTH) %>%
mutate(RSLT_RESULT_VALUE=ifelse(!is.na(RSLT_LABORATORY_QUALIFIER)&(RSLT_LABORATORY_QUALIFIER=="U"|RSLT_LABORATORY_QUALIFIER=="UE"),"0",RSLT_RESULT_VALUE),
RSLT_RESULT_VALUE=as.numeric(RSLT_RESULT_VALUE)) %>%
filter(!is.na(RSLT_RESULT_VALUE),
is.na(RSLT_VALIDATOR_QUALIFIER)|(RSLT_VALIDATOR_QUALIFIER!="R"),
combined %in% c('CHLOROPHYLL A_OW_TOTAL',
'PHOSPHORUS, TOTAL_OW_TOTAL',
"MICROCYSTIN_OW_NA",
"NITROGEN, NITRATE (AS N)_OW_TOTAL",
"NITROGEN, NITRATE-NITRITE_OW_TOTAL",
"NITROGEN, KJELDAHL, TOTAL_OW_TOTAL",
"NITROGEN, TOTAL_OW_TOTAL",
"DISSOLVED OXYGEN_DP_NA",
"TRUE COLOR_OW_TOTAL",
"PH_DP_NA",
"SPECIFIC CONDUCTANCE_DP_NA",
"TEMPERATURE_DP_NA",
"DEPTH, SECCHI DISK DEPTH_SD_NA",
"CALCIUM_OW_TOTAL",
"CHLORIDE_OW_TOTAL",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA",
"ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL",
"ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL")) %>%
select(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_RESULT_VALUE,RSLT_PROFILE_DEPTH) %>%
distinct(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_PROFILE_DEPTH,.keep_all = TRUE) %>%
rename(LAKE_ID=LAKE_HISTORY_ID,
chemical_name=combined,
result_value=RSLT_RESULT_VALUE,
profile_depth=RSLT_PROFILE_DEPTH)
# dissolved oxygen:
# keep DO values between 0 and 2m depth
lab2 <- lab %>%
mutate(keep=case_when(
!(chemical_name=="DISSOLVED OXYGEN_DP_NA") ~ "no",
chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth <= 2 ~ "yes",
chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth > 2 ~ "no")) %>%
filter(keep!="no") %>%
select(-keep)
# mean DO for each lake each date
DO <- aggregate(lab2$result_value,list(lab2$LAKE_ID,lab2$SAMPLE_DATE),FUN=mean) %>%
mutate(chemical_name="DISSOLVED OXYGEN_epi") %>%
rename(LAKE_ID=Group.1,
SAMPLE_DATE=Group.2,
result_value=x)
#thermocline
thermocline<-lab %>%
filter(chemical_name=='TEMPERATURE_DP_NA') %>%
mutate(thermocline=NA)
LID<-thermocline$LAKE_ID[1]
date<-thermocline$SAMPLE_DATE[1]
depth<-thermocline$profile_depth[1]
temp<-thermocline$result_value[1]
for(i in seq(nrow(thermocline))){
current<-thermocline[i,]
if(current$LAKE_ID==LID¤t$SAMPLE_DATE==date){
depth=current$profile_depth
if((temp-current$result_value)>1){
thermocline$thermocline[i]<-current$profile_depth
}
temp=current$result_value
}
else{
LID=current$LAKE_ID
date=current$SAMPLE_DATE
depth=current$profile_depth
temp=current$result_value
}
}
thermocline<-thermocline %>% #pull the lowest value for all
group_by(LAKE_ID,SAMPLE_DATE) %>%
mutate(thermocline=min(thermocline, na.rm=T)) %>%
ungroup() %>%
mutate(thermocline=ifelse(thermocline==Inf,NA,thermocline)) %>%
select(LAKE_ID,SAMPLE_DATE,thermocline,profile_depth) %>%
filter(!is.na(thermocline)) %>% #remove those without a thermocline
distinct()
#epilimentic means for ph and spconductance
epi<-lab %>%
inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>%
filter(chemical_name %in% c("PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA")) %>%
mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>%
distinct() %>%
arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth)
epi<-epi %>%
filter(profile_depth<=thermocline) %>%
distinct() %>%
group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>%
summarize(Mean=mean(result_value,na.rm=TRUE),
n=n()) %>%
filter(n>2) %>%
select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)
epi<-epi %>%
mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>%
rename(result_value=Mean) %>%
mutate(chemical_name=case_when(
chemical_name=="PH_DP_NA"~"PH_epi",
chemical_name=="SPECIFIC CONDUCTANCE_DP_NA"~"SPECIFIC CONDUCTANCE_epi"
))
# read in site data
sites<-read.csv("Probability_Based_Sites_2020_2021.csv")
sites<-sites %>%
rename(siteID=SITE_ID,
xcoord=LON_DD83,
ycoord=LAT_DD83,
Eval_Status=EvalStatus) %>%
filter(Eval_Status!="") %>% #removing sites that we haven't yet evaluated
mutate(Eval_Status=trimws(Eval_Status))
# restrict to only the data in the probability study and spread the data
att<-merge(lab,epi,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>%
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,DO,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE)%>%
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
filter(!chemical_name %in% c("DISSOLVED OXYGEN_DP_NA","PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA","TEMPERATURE_DP_NA")) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,sites,by=c('LAKE_ID'),all.y = TRUE) %>% distinct()
att<-att %>% spread(chemical_name,result_value,fill = NA)
att<-att %>% filter((Eval_Status=="Target_Sampled"&!is.na(`CHLOROPHYLL A_OW_TOTAL`)&!is.na(`PHOSPHORUS, TOTAL_OW_TOTAL`)) |
(Eval_Status!="Target_Sampled") |
(LAKE_ID %in% c("0703UWBXXX1","0801KAY0984A","1203MET0821","0602LUD0099","0801GUL0969")))
#creating thresholds
att<-att %>%
# remove fields in the sites table that aren't relevant
select(-Accessible,-Comments,-Contact,-STATUS) %>%
# create total nitrogen
mutate(`NITROGEN, TOTAL`=case_when(
!is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~
(`NITROGEN, NITRATE-NITRITE_OW_TOTAL`+`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`),
is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~ `NITROGEN, TOTAL_OW_TOTAL`)) %>%
# DIN:TP
mutate(DINTP=`NITROGEN, NITRATE (AS N)_OW_TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`) %>%
# trophic status
mutate(phos_trophic=case_when(
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.01 ~ "Oligotrophic",
between(`PHOSPHORUS, TOTAL_OW_TOTAL`,0.01,0.02) ~ "Mesotrophic",
`PHOSPHORUS, TOTAL_OW_TOTAL`>=0.02 ~ "Eutrophic")) %>%
mutate(chla_trophic=case_when(
`CHLOROPHYLL A_OW_TOTAL`<=2 ~ "Oligotrophic",
between(`CHLOROPHYLL A_OW_TOTAL`,2,8) ~ "Mesotrophic",
`CHLOROPHYLL A_OW_TOTAL`>=8 ~ "Eutrophic")) %>%
# EPA thresholds
mutate(TP_threshold=case_when(
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.016 ~ "Good",
between(`PHOSPHORUS, TOTAL_OW_TOTAL`, 0.016, 0.0279) ~ "Fair",
`PHOSPHORUS, TOTAL_OW_TOTAL`>=0.0279 ~ "Poor")) %>%
mutate(TN_threshold=case_when(
`NITROGEN, TOTAL`<=0.428 ~ "Good",
between(`NITROGEN, TOTAL`, 0.428, 0.655) ~ "Fair",
`NITROGEN, TOTAL`>=0.655 ~ "Poor")) %>%
mutate(CHLA_threshold=case_when(
`CHLOROPHYLL A_OW_TOTAL`<=4.52 ~ "Good",
between(`CHLOROPHYLL A_OW_TOTAL`, 4.52, 8.43) ~ "Fair",
`CHLOROPHYLL A_OW_TOTAL`>=8.43 ~ "Poor")) %>%
# microcystin
mutate(microcystin=case_when(
is.na(`MICROCYSTIN_OW_NA`) ~"Non-detect",
`MICROCYSTIN_OW_NA`<8 ~ "Microcystin Detected",
`MICROCYSTIN_OW_NA`>=8 ~ "Most disturbed")) %>%
# dissolved oxygen
mutate(d.oxygen=case_when(
`DISSOLVED OXYGEN_epi`<=3 ~ "Poor",
between(`DISSOLVED OXYGEN_epi`, 3, 5) ~ "Fair",
`DISSOLVED OXYGEN_epi`>=5 ~ "Good")) %>%
#nutrient limitation
mutate(N_LIMIT=case_when(
`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`<=10 ~ "N-limited",
between(`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`, 10, 20) ~ "Co-limited",
`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`>=20 ~ "P-limited")) %>%
#secchi
mutate(secchi=case_when(
`DEPTH, SECCHI DISK DEPTH_SD_NA`<=2 ~ "Eutrophic",
between(`DEPTH, SECCHI DISK DEPTH_SD_NA`, 2, 5) ~ "Mesotrophic",
`DEPTH, SECCHI DISK DEPTH_SD_NA`>=5 ~ "Oligotrophic")) %>%
#zebra mussels
mutate(zebra=case_when(
CALCIUM_OW_TOTAL<=10 ~ "Not susceptible",
between(CALCIUM_OW_TOTAL, 10, 20) ~ "May be susceptible",
CALCIUM_OW_TOTAL>=20 ~ "Highly susceptible")) %>%
#cslap
mutate(conductance=case_when(
`SPECIFIC CONDUCTANCE_epi`<=125 ~ "Soft",
between(`SPECIFIC CONDUCTANCE_epi`, 125, 250) ~ "Average",
`SPECIFIC CONDUCTANCE_epi`>=250 ~ "Hard")) %>%
mutate(color=case_when(
`TRUE COLOR_OW_TOTAL`<=10 ~ "Uncolored",
between(`TRUE COLOR_OW_TOTAL`, 10, 30) ~ "Weak",
`TRUE COLOR_OW_TOTAL`>=30 ~ "High")) %>%
mutate(ph=case_when(
`PH_epi`<=6.5 ~ "Acidic",
between(`PH_epi`, 6.5, 7.5) ~ "~Neutral",
between(`PH_epi`, 7.5, 8.5) ~ "Slightly alk",
`PH_epi`>=8.5 ~ "Highly alk")) %>%
#leech
mutate(leech=case_when(
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.030 & `TRUE COLOR_OW_TOTAL`<=20 ~ "Blue",
`PHOSPHORUS, TOTAL_OW_TOTAL`>0.030 & `TRUE COLOR_OW_TOTAL`<=20 ~ "Green",
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.030 & `TRUE COLOR_OW_TOTAL`>20 ~ "Brown",
`PHOSPHORUS, TOTAL_OW_TOTAL`>0.030 & `TRUE COLOR_OW_TOTAL`>20 ~ "Murky"
)) %>%
#chloride
mutate(chloride=case_when(
`CHLORIDE_OW_TOTAL` <= 35 ~ "Low",
between(`CHLORIDE_OW_TOTAL`,35,250) ~ "Medium",
`CHLORIDE_OW_TOTAL` >= 250 ~ "High",
Eval_Status=="Target_Sampled" & is.na(`CHLORIDE_OW_TOTAL`) ~ "Low")) %>%
#chlorophyll
rename(CHLOROPHYTE=`CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA`,
CRYPTOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA`,
CYANOBACTERIA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA`,
DINOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA`,
PROBE_TOTAL=`CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA`) %>%
mutate(chlorophyte=case_when(
CHLOROPHYTE/PROBE_TOTAL<=0.25 ~ "Low",
between(CHLOROPHYTE/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CHLOROPHYTE/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(cryptophyta=case_when(
CRYPTOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
between(CRYPTOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CRYPTOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(cyanobacteria=case_when(
CYANOBACTERIA/PROBE_TOTAL<=0.25 ~ "Low",
between(CYANOBACTERIA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CYANOBACTERIA/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(dinophyta=case_when(
DINOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
between(DINOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
DINOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>%
#alkalinity
mutate(alkalinity=case_when(
`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL` <= 60 ~ "Soft",
between(`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,60,120) ~ "Moderately hard",
between(`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,120,180) ~ "Hard",
`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL` >= 180 ~ "Very hard"))
# Create a Target/NonTarget status variable
att<-att %>%
mutate(statusTNT="Target",
statusTNT=ifelse(Eval_Status=="NonTarget","NonTarget",statusTNT))
# remove duplicates (multiple samples of same lake)
att <- att %>%
mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
group_by(siteID) %>%
mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
filter(has_aug & month == 8 |
!has_aug) %>% # filter only august dates from sites with or all from those without
slice_max(n = 1,
order_by = SAMPLE_DATE,
with_ties = F) %>% # change with_ties to TRUE to discard NA values
select(-has_aug)
att<-att %>%
ungroup() %>%
mutate(WgtAdj=case_when(
PROB_CAT=="(1,4]"~(1828/29),
PROB_CAT=="(4,10]"~(2490/15),
PROB_CAT=="(10,20]"~(1003/18),
PROB_CAT=="(20,50]"~(616/20),
PROB_CAT==">50"~(500/33)))
# rm(list=setdiff(ls(), c('newdata',"att","sites")))
Additionally, I will load in NLA, NAP and NH data:
# read in NLA data
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")
chem <- read.csv("nla_2017_water_chemistry_chla-data.csv")
chem <- chem %>%
filter(ANALYTE %in% c("NTL","CHLA","PTL","NITRATE_N","COLOR","CHLORIDE"),
NARS_FLAG == "") %>%
select(UID,SITE_ID,DATE_COL,ANALYTE,RESULT) %>%
pivot_wider(names_from= ANALYTE ,values_from = RESULT,) %>%
rename(TN=NTL,TP=PTL)
oxygen <- read.csv("nla_2017_profile-data.csv")
oxygen <- oxygen %>%
select(UID,SITE_ID,DATE_COL,DEPTH,OXYGEN) %>%
# dissolved oxygen:
# keep DO values between 0 and 2m depth
mutate(keep=case_when(
DEPTH <= 2 ~ "yes",
DEPTH > 2 ~ "no")) %>%
filter(keep=="yes") %>%
select(-keep)
# mean DO for each lake
DO <- aggregate(oxygen$OXYGEN,list(oxygen$SITE_ID,oxygen$DATE_COL),FUN=mean)
# add back in
oxygen <- merge(oxygen,DO,by.x=c("SITE_ID","DATE_COL"),by.y=c("Group.1","Group.2"),all.x=TRUE)
oxygen$x <- as.numeric(oxygen$x)
oxygen <- oxygen %>%
select(-c(DEPTH,OXYGEN)) %>%
rename(OXYGEN=x) %>%
distinct()
att.nla <- merge(chem,oxygen,by=c("UID","SITE_ID","DATE_COL"),all.y=TRUE,all.x=TRUE)
att.nla$CHLA <- as.numeric(att.nla$CHLA)
att.nla$TN <- as.numeric(att.nla$TN)
att.nla$TP <- as.numeric(att.nla$TP)
att.nla$CHLORIDE <- as.numeric(att.nla$CHLORIDE)
att.nla$OXYGEN <- as.numeric(att.nla$OXYGEN)
att.nla$COLOR <- as.numeric(att.nla$COLOR)
att.nla$DINTP <- as.numeric(att.nla$NITRATE_N)/(as.numeric(att.nla$TP)/1000)
# selecting parameters and adding trophic status
att.nla<-att.nla %>%
# trophic status
mutate(phos_trophic=case_when(
TP<=10 ~ "oligotrophic",
between(TP,10,20) ~ "mesotrophic",
TP>=20 ~ "eutrophic")) %>%
mutate(chla_trophic=case_when(
CHLA<=2 ~ "oligotrophic",
between(CHLA,2,8) ~ "mesotrophic",
CHLA>=8 ~ "eutrophic")) %>%
# EPA thresholds
mutate(TP_threshold=case_when(
TP<=16 ~ "Good",
between(TP, 16, 27.9) ~ "Fair",
TP>=27.9 ~ "Poor")) %>%
mutate(TN_threshold=case_when(
TN<=0.428 ~ "Good",
between(TN, 0.428, 0.655) ~ "Fair",
TN>=0.655 ~ "Poor")) %>%
mutate(CHLA_threshold=case_when(
CHLA<=4.52 ~ "Good",
between(CHLA, 4.52, 8.43) ~ "Fair",
CHLA>=8.43 ~ "Poor")) %>%
# dissolved oxygen
mutate(d.oxygen=case_when(
OXYGEN<=3 ~ "Poor",
between(OXYGEN, 3, 5) ~ "Fair",
OXYGEN>=5 ~ "Good")) %>%
#leech
mutate(leech=case_when(
TP<=30 & COLOR<=20 ~ "Blue",
TP>30 & COLOR<=20 ~ "Green",
TP<=30 & COLOR>20 ~ "Brown",
TP>30 & COLOR>20 ~ "Murky"
)) %>%
#chloride
mutate(chloride=case_when(
CHLORIDE <= 35 ~ "Low",
between(CHLORIDE,35,250) ~ "Medium",
CHLORIDE >= 250 ~ "High"))
sites <- read.csv("nla_2017_site_information-data.csv")
nap.sites <- sites %>%
mutate(include=case_when(
AG_ECO9=="NAP" & AREA_HA>2.63 ~ "yes",
SITE_ID=TRUE ~ "no"
)) %>%
select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>%
filter(WGT_TP_EXTENT != 0)
nla.sites <- sites %>%
mutate(include=case_when(
AREA_HA>2.63 ~ "yes",
SITE_ID=TRUE ~ "no"
)) %>%
select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>%
filter(WGT_TP_EXTENT != 0)
nla.att <- merge(att.nla,nla.sites,by=c("UID","SITE_ID"))
nap.att <- merge(att.nla,nap.sites,by=c("UID","SITE_ID"))
nh.att <- read.csv("New_Hampshire_Lakes_2017_Design_Status_20211027_KH_EPA.csv", na.strings=c(""," ","NA"))
# selecting parameters and adding trophic status
nh.att<-nh.att %>%
mutate(include=case_when(
AREA_HA>2.63 ~ "yes",
SITE_ID=TRUE ~ "no"))
chem <- read.csv("BasicNHChem_ForNY.csv")
#merge
nh.att <- merge(nh.att,chem,by.x="SITE_ID",by.y="NLA.ID",all.x=T) %>%
mutate(CHLORIDE=case_when(
CHLORIDE!="<3" ~ as.numeric(CHLORIDE),
CHLORIDE=="<3" ~ 0)) %>%
mutate(CHLORIDE_threshold=case_when(
CHLORIDE<=35 ~ "Low",
between(CHLORIDE,35,250) ~ "Medium",
CHLORIDE>=250 ~ "High"
))
Here I perform the analysis using cat_analysis(). The relevant categorical variable / threshold should be input as the “vars” argument. “vars” can be a list of all categories you are interested in, which would create a very big dataframe.
In the future, the “weight” argument should be part of the site design and might not be called “WgtAdj”.
#list variables you are interested in and defined above
vars <- c("phos_trophic", "chla_trophic", "TP_threshold", "TN_threshold", "CHLA_threshold", "microcystin", "d.oxygen", "N_LIMIT", "secchi", "zebra", "conductance", "color", "ph", "leech", "chloride","alkalinity","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
#analysis
CatExtent <- cat_analysis(
dframe=att,
vars=vars,
subpops = , #for example could separate by area category or year
siteID = "siteID",
weight = "WgtAdj", #name will probably change in future
xcoord = "xcoord",
ycoord = "ycoord")
table <- CatExtent %>%
select(Indicator,
Category,
Estimate.P,
LCB95Pct.P,
UCB95Pct.P) %>%
filter(Category!="Total") %>%
mutate(Category=factor(Category, levels=c("Poor","Fair","Good",
"Blue","Green","Brown","Murky",
"High","Weak","Uncolored",
"Medium","Low",
"Acidic","~Neutral","Slightly alk","Highly alk",
"Soft","Moderately hard","Average","Hard","Very hard",
"Highly susceptible","May be susceptible","Not susceptible",
"Eutrophic","Mesotrophic","Oligotrophic",
"N-limited","Co-limited","P-limited",
"Non-detect","Microcystin detected","Most disturbed")),
Indicator=case_when(
Indicator=="phos_trophic"~"Phosphorus",
Indicator=="chla_trophic"~"Chlorophyll-a",
Indicator=="TP_threshold"~"Total phosphorus",
Indicator=="TN_threshold"~"Total nitrogen",
Indicator=="CHLA_threshold"~"Total chlorophyll",
Indicator=="microcystin"~"Microcystin",
Indicator=="d.oxygen"~"Dissolved oxygen",
Indicator=="N_LIMIT"~"Nutrient limitation",
Indicator=="secchi"~"Secchi",
Indicator=="zebra"~"Zebra mussel susceptibility",
Indicator=="conductance"~"Hardness",
Indicator=="color"~"Color",
Indicator=="ph"~"pH",
Indicator=="leech"~"Nutrient-color status",
Indicator=="chloride"~"Chloride",
Indicator=TRUE~Indicator)
)
hux <- as_hux(table) #I use huxtable because kable doesn't work on my computer but that is also fine
number_format(hux) <- 2
theme_plain(hux)
| Indicator | Category | Estimate.P | LCB95.00Pct.P | UCB95.00Pct.P |
|---|---|---|---|---|
| Phosphorus | Eutrophic | 21.55 | 10.71 | 32.38 |
| Phosphorus | Mesotrophic | 36.69 | 21.07 | 52.32 |
| Phosphorus | Oligotrophic | 41.76 | 27.81 | 55.71 |
| Chlorophyll-a | Eutrophic | 12.30 | 5.67 | 18.92 |
| Chlorophyll-a | Mesotrophic | 69.24 | 56.29 | 82.19 |
| Chlorophyll-a | Oligotrophic | 18.47 | 6.71 | 30.22 |
| Total phosphorus | Fair | 21.24 | 9.29 | 33.20 |
| Total phosphorus | Good | 62.55 | 49.64 | 75.46 |
| Total phosphorus | Poor | 16.20 | 5.80 | 26.61 |
| Total nitrogen | Fair | 18.21 | 8.34 | 28.07 |
| Total nitrogen | Good | 63.71 | 49.46 | 77.97 |
| Total nitrogen | Poor | 18.08 | 5.40 | 30.76 |
| Total chlorophyll | Fair | 35.43 | 19.54 | 51.31 |
| Total chlorophyll | Good | 52.28 | 36.55 | 68.01 |
| Total chlorophyll | Poor | 12.30 | 5.67 | 18.92 |
| Microcystin | Non-detect | 100.00 | 100.00 | 100.00 |
| Dissolved oxygen | Fair | 1.24 | 0.00 | 3.41 |
| Dissolved oxygen | Good | 91.49 | 80.81 | 100.00 |
| Dissolved oxygen | Poor | 7.27 | 0.00 | 17.81 |
| Nutrient limitation | Co-limited | 16.52 | 3.27 | 29.77 |
| Nutrient limitation | N-limited | 0.74 | 0.00 | 2.03 |
| Nutrient limitation | P-limited | 82.74 | 69.41 | 96.07 |
| Secchi | Eutrophic | 47.12 | 32.35 | 61.90 |
| Secchi | Mesotrophic | 48.11 | 33.20 | 63.02 |
| Secchi | Oligotrophic | 4.76 | 0.76 | 8.77 |
| Zebra mussel susceptibility | Highly susceptible | 29.72 | 12.54 | 46.90 |
| Zebra mussel susceptibility | May be susceptible | 14.87 | 0.00 | 31.71 |
| Zebra mussel susceptibility | Not susceptible | 55.41 | 34.80 | 76.02 |
| Hardness | Average | 18.23 | 4.13 | 32.33 |
| Hardness | Hard | 22.69 | 7.49 | 37.89 |
| Hardness | Soft | 59.08 | 43.52 | 74.64 |
| Color | High | 68.64 | 52.20 | 85.07 |
| Color | Uncolored | 8.28 | 0.14 | 16.41 |
| Color | Weak | 23.09 | 10.25 | 35.92 |
| pH | ~Neutral | 46.31 | 29.06 | 63.56 |
| pH | Acidic | 14.93 | 4.35 | 25.51 |
| pH | Highly alk | 17.13 | 3.20 | 31.06 |
| pH | Slightly alk | 21.63 | 10.17 | 33.10 |
| Nutrient-color status | Blue | 21.59 | 7.15 | 36.04 |
| Nutrient-color status | Brown | 54.50 | 34.94 | 74.06 |
| Nutrient-color status | Green | 1.08 | 0.00 | 3.07 |
| Nutrient-color status | Murky | 22.83 | 6.76 | 38.90 |
| Chloride | High | 6.37 | 0.00 | 16.31 |
| Chloride | Low | 76.83 | 64.57 | 89.09 |
| Chloride | Medium | 16.80 | 6.68 | 26.93 |
| alkalinity | Hard | 11.04 | 0.16 | 21.92 |
| alkalinity | Moderately hard | 19.86 | 6.90 | 32.81 |
| alkalinity | Soft | 69.11 | 56.03 | 82.18 |
| chlorophyte | High | 75.45 | 58.60 | 92.30 |
| chlorophyte | Low | 13.42 | 0.00 | 28.43 |
| chlorophyte | Medium | 11.13 | 1.60 | 20.66 |
| cryptophyta | Low | 100.00 | 100.00 | 100.00 |
| cyanobacteria | High | 16.19 | 0.32 | 32.07 |
| cyanobacteria | Low | 65.93 | 47.82 | 84.03 |
| cyanobacteria | Medium | 17.88 | 6.75 | 29.00 |
| dinophyta | High | 2.11 | 0.00 | 5.90 |
| dinophyta | Low | 87.84 | 78.38 | 97.29 |
| dinophyta | Medium | 10.06 | 1.51 | 18.61 |
ny.cat <- table %>% filter(Indicator %in% c("Total phosphorus","Total nitrogen","Total chlorophyll","Dissolved oxygen","Chloride")) %>% mutate(Study="NY")
probability.cat <- table %>% filter(Indicator %in% c("Phosphorus","Chlorophyll-a","Secchi"))
Here I perform the analysis using cont_analysis(). The relevant CONTINUOUS variable should be input as the “vars” argument. “vars” can be a list of all categories you are interested in, which would create a very big dataframe with all. The output of this function is a list with 3 estimations: cumulative distribution function (CDF), percentiles, and means.
As above, in the future the “weight” argument should be part of the site design and might not be called “WgtAdj”.
#To conduct analysis on a continuous variable, using a new list of CONTINUOUS variables, also note that cont_analysis() doesn't like variable names that have spaces so you will need to rename these. It's helpful to put the units in the name.
att <- att %>% rename("CHLOROPHYLL_ug_L"=`CHLOROPHYLL A_OW_TOTAL`,
"NITROGEN_mg_L"=`NITROGEN, TOTAL`,
"PHOSPHORUS_mg_L"=`PHOSPHORUS, TOTAL_OW_TOTAL`,
"DISSOLVED_OXYGEN_mg_L"=`DISSOLVED OXYGEN_epi`,
"ALKALINITY_mg_L"=`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,
"CHLORIDE_mg_L"=CHLORIDE_OW_TOTAL,
"CALCIUM_mg_L"=CALCIUM_OW_TOTAL)
vars <- c("CHLOROPHYLL_ug_L","NITROGEN_mg_L","PHOSPHORUS_mg_L","DISSOLVED_OXYGEN_mg_L","CHLORIDE_mg_L","CALCIUM_mg_L","ALKALINITY_mg_L")
#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
dframe = att,
vars = vars,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
myvars <- c("CHLOROPHYLL_ug_L","NITROGEN_mg_L","PHOSPHORUS_ug_L","CHLORIDE_mg_L","DISSOLVED_OXYGEN_mg_L")
NY<-analysis$CDF %>%
select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>%
mutate(Study="NY") %>%
mutate(Value=case_when(
Indicator=="PHOSPHORUS_mg_L"~as.numeric(Value)*1000,
Indicator=TRUE~as.numeric(Value)
)) %>%
mutate(Indicator=case_when(
Indicator=="PHOSPHORUS_mg_L"~"PHOSPHORUS_ug_L",
Indicator=TRUE~Indicator
)) %>%
filter(Indicator %in% c(myvars))
str(analysis)
## List of 3
## $ CDF :'data.frame': 319 obs. of 15 variables:
## ..$ Type : chr [1:319] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation : chr [1:319] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:319] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
## ..$ Value : num [1:319] 1.00e-10 4.45e-01 5.50e-01 6.84e-01 1.05 ...
## ..$ nResp : num [1:319] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ Estimate.P : num [1:319] 1.33 2.66 3.31 3.97 4.62 ...
## ..$ StdError.P : num [1:319] 1.17 1.64 1.76 1.88 1.95 ...
## ..$ MarginofError.P: num [1:319] 2.29 3.22 3.45 3.69 3.82 ...
## ..$ LCB95Pct.P : num [1:319] 0 0 0 0.279 0.801 ...
## ..$ UCB95Pct.P : num [1:319] 3.62 5.87 6.76 7.65 8.44 ...
## ..$ Estimate.U : num [1:319] 30.8 61.6 76.8 91.9 107.1 ...
## ..$ StdError.U : num [1:319] 27 37.7 39.9 42.2 43.1 ...
## ..$ MarginofError.U: num [1:319] 52.9 73.9 78.1 82.7 84.5 ...
## ..$ LCB95Pct.U : num [1:319] 0 0 0 9.18 22.51 ...
## ..$ UCB95Pct.U : num [1:319] 83.7 135.5 154.9 174.6 191.6 ...
## $ Pct :'data.frame': 49 obs. of 10 variables:
## ..$ Type : chr [1:49] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation: chr [1:49] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:49] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
## ..$ Statistic : chr [1:49] "5Pct" "10Pct" "25Pct" "50Pct" ...
## ..$ nResp : num [1:49] 5 9 15 26 33 47 50 4 4 10 ...
## ..$ Estimate : num [1:49] 1.11 1.11 1.11 1.11 1.11 ...
## ..$ StdError : num [1:49] 0.415 0.338 0.418 0.593 1.205 ...
## ..$ MarginofError: num [1:49] 0.814 0.663 0.819 1.163 2.362 ...
## ..$ LCB95Pct : num [1:49] 0 0.598 1.643 2.613 4.613 ...
## ..$ UCB95Pct : num [1:49] 1.66 1.95 3.32 4.99 9.44 ...
## $ Mean:'data.frame': 7 obs. of 9 variables:
## ..$ Type : chr [1:7] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation: chr [1:7] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:7] "CHLOROPHYLL_ug_L" "NITROGEN_mg_L" "PHOSPHORUS_mg_L" "DISSOLVED_OXYGEN_mg_L" ...
## ..$ nResp : int [1:7] 57 49 62 57 57 33 57
## ..$ Estimate : num [1:7] 5.2437 0.5581 0.0185 7.8411 43.6026 ...
## ..$ StdError : num [1:7] 0.46823 0.08494 0.00227 0.45393 18.0289 ...
## ..$ MarginofError: num [1:7] 0.91771 0.16648 0.00444 0.88969 35.336 ...
## ..$ LCB95Pct : num [1:7] 4.326 0.3916 0.0141 6.9515 8.2666 ...
## ..$ UCB95Pct : num [1:7] 6.161 0.725 0.023 8.731 78.939 ...
Here I have plotted the analyses from above. The categorical variables are plotted as dots, maps and bars; the continuous variables are plotted as a distribution function. They are also grouped by QAPP parameter categories in the “Grouped plots” tab.
plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
geom_point()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
facet_wrap(.~Indicator,scales = "free")+
ylim(0,100)+
labs(title="Condition estimates across NYS Ponded Waters",y="Percent of Total",x="Condition category")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
#To plot a cumulative distribution function:
ggplot(analysis$CDF,aes(x=Value,y=Estimate.P,fill=Indicator,shape=Indicator,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
geom_line()+
geom_point()+
geom_ribbon(alpha=0.5)+
ylim(0,100)+
facet_wrap(.~Indicator, scales="free")+
guides(fill=FALSE,shape=FALSE)+
labs(y="Percent of lakes")
#fetch map
library(ggmap)
library(maps)
states <- map_data("state")
ny <- subset(states, region %in% c("new york"))
att$CHLA_threshold <- factor(att$CHLA_threshold, levels=c("Good","Fair","Poor"))
ggplot() +
geom_polygon(data=ny,aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) +
geom_point(data=att,aes(x=xcoord,y=ycoord,color=CHLA_threshold))+
guides(fill=FALSE)+
theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
#list variables you are interested in and defined above
trophic <- c("phos_trophic","chla_trophic","N_LIMIT","secchi","leech","color")
minerals <- c("zebra","alkalinity")
in.situ <- c("ph","conductance")
habs <- c("microcystin","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
vars.list <- list(Trophic=trophic,Minerals=minerals,`In Situ`=in.situ,HABs=habs)
n<-0
#analysis
for(i in vars.list){
n<-n+1
CatExtent <- cat_analysis(
dframe=att,
vars=i,
subpops = , #for example could separate by area category or year
siteID = "siteID",
weight = "WgtAdj", #name will probably change in future
xcoord = "xcoord",
ycoord = "ycoord")
table <- CatExtent %>%
select(Indicator,
Category,
Estimate.P,
LCB95Pct.P,
UCB95Pct.P) %>%
filter(Category!="Total") %>%
mutate(Category=factor(Category, levels=c("Poor","Fair","Good",
"Blue","Green","Brown","Murky",
"High","Weak","Uncolored",
"Medium","Low",
"Acidic","~Neutral","Slightly alk","Highly alk",
"Soft","Moderately hard","Average","Hard","Very hard",
"Highly susceptible","May be susceptible","Not susceptible",
"Eutrophic","Mesotrophic","Oligotrophic",
"N-limited","Co-limited","P-limited",
"Non-detect","Microcystin detected","Most disturbed")),
Indicator=case_when(
Indicator=="phos_trophic"~"Phosphorus",
Indicator=="chla_trophic"~"Chlorophyll-a",
Indicator=="TP_threshold"~"Total phosphorus",
Indicator=="TN_threshold"~"Total nitrogen",
Indicator=="CHLA_threshold"~"Total chlorophyll",
Indicator=="microcystin"~"Microcystin",
Indicator=="d.oxygen"~"Dissolved oxygen",
Indicator=="N_LIMIT"~"Nutrient limitation",
Indicator=="secchi"~"Secchi",
Indicator=="zebra"~"Zebra mussel susceptibility",
Indicator=="conductance"~"Conductance",
Indicator=="color"~"Color",
Indicator=="ph"~"pH",
Indicator=="leech"~"Nutrient-color status",
Indicator=='alkalinity'~"Alkalinity",
Indicator=="chlorophyte"~"% Chlorophyte",
Indicator=="cryptophyta"~"% Cryptophyta",
Indicator=="cyanobacteria"~"% Cyanobacteria",
Indicator=="dinophyta"~"% Dinophyta",
Indicator=TRUE~Indicator)
)
plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
geom_col()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
facet_wrap(.~Indicator,scales = "free")+
ylim(0,100)+
# scale_x_discrete(guide = guide_axis(n.dodge = 2))+
labs(y="Percent of Total",x="Condition category",title=paste(names(vars.list)[n],"Parameters"))+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
att <- att %>% rename("SECCHI_m"=`DEPTH, SECCHI DISK DEPTH_SD_NA`,
"TRUE_COLOR"=`TRUE COLOR_OW_TOTAL`,
"SPEC_CONDUCTANCE"=`SPECIFIC CONDUCTANCE_epi`)
#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
"NITROGEN_mg_L",
"PHOSPHORUS_mg_L",
"SECCHI_m",
"TRUE_COLOR")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
"PH_epi",
"SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL")
vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals)
n<-0
for(i in vars.list){
n<-n+1
#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
dframe = att,
vars = i,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
table<-analysis$CDF %>%
select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>%
filter(Indicator %in% i) %>%
mutate(Indicator=case_when(
Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
Indicator=="PHOSPHORUS_mg_L"~"Total phosphorus (mg/L)",
Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
Indicator=="SECCHI_m"~"Secchi disk depth (m)",
Indicator=="TRUE_COLOR"~"True color (PCU)",
Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
Indicator=="PH_epi"~"pH (SU)",
Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
Indicator=TRUE~Indicator
)) %>%
mutate(Lower=case_when(
Indicator=="Chlorophyll-a (ug/L)"~2,
Indicator=="Total phosphorus (mg/L)"~0.01,
Indicator=="Specific conductance (mS/cm)"~125,
Indicator=="Alkalinity (mg/L)"~60,
Indicator=="Calcium (mg/L)"~10,
Indicator=="True color (PCU)"~10,
Indicator=="Secchi disk depth (m)"~2,
Indicator=TRUE~-5)) %>%
mutate(Upper=case_when(
Indicator=="Chlorophyll-a (ug/L)"~8,
Indicator=="Total phosphorus (mg/L)"~0.02,
Indicator=="Specific conductance (mS/cm)"~250,
Indicator=="Alkalinity (mg/L)"~120,
Indicator=="Calcium (mg/L)"~20,
Indicator=="True color (PCU)"~20,
Indicator=="Secchi disk depth (m)"~5,
Indicator=TRUE~-5))
plot<-ggplot(table,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
geom_line()+
geom_point()+
geom_ribbon(alpha=0.5)+
ylim(0,100)+
scale_x_continuous(limits = c(0, NA))+
guides(fill="none",shape="none")+
facet_wrap(.~Indicator, scales="free")+
geom_vline(aes(xintercept=Lower),linetype="dashed")+
geom_vline(aes(xintercept=Upper),linetype="dashed")+
labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
Here I make the comparisons to NLA and NH data from above, as well as to historical LMAS sampling. The values for LMAS sampling are derived from a separate script that counts the number of lakes in each size class from samples in the last 10 years.
## During execution of the program, 3 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
## During execution of the program, 10 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
cats <- rbind(ny.cat,nh.cat,nla.cat,nap.cat)
plot <- ggplot(cats,aes(x=Category,y=Estimate.P,color=Study,shape=Study))+
geom_point(position=position_dodge(width=0.2))+
ylim(0,100)+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=.25, position=position_dodge(width=0.2))+
facet_wrap(.~Indicator, scales="free")
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
## During execution of the program, 4 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
plot <- ggplot(all,aes(x=Value,y=Estimate.P,color=Study,shape=Study,linetype=Study))+
geom_line()+
ylim(0,100)+
geom_vline(data=table,aes(xintercept=Low),linetype="dashed")+
geom_vline(data=table,aes(xintercept=High),linetype="dashed")+
scale_linetype_manual(values=c("solid","dotdash", "dotted","dashed"))+
facet_wrap(.~Indicator, scales="free")
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
The plot below shows the distribution of the size classes and trophic classes included in the data frame. For comparison, I’ve plotted the proportion of NYS lakes sampled in each size class since 2010.
#Plotting data frame distribution as well as NYS sample distribution collected since 2010
#LMAS numbers retrieved in 2020
frame<-att %>%
select(PROB_CAT,AREA_CAT) %>%
rename(size_category=AREA_CAT) %>%
distinct() %>%
mutate(Probability_Sites = case_when( #statewide estimate
endsWith(size_category, "(1,4]") ~ ((1828/6437)*100),
endsWith(size_category, "(4,10]") ~ ((2490/6437)*100),
endsWith(size_category, "(10,20]") ~ ((1003/6437)*100),
endsWith(size_category, "(20,50]") ~ ((616/6437)*100),
endsWith(size_category, ">50") ~ ((500/6437)*100))
) %>%
mutate(LMAS_Sites = case_when(
endsWith(size_category, "(1,4]") ~ (7.64),
endsWith(size_category, "(4,10]") ~ (16.7),
endsWith(size_category, "(10,20]") ~ (16.2),
endsWith(size_category, "(20,50]") ~ (19.3),
endsWith(size_category, ">50") ~ (40.1)
)) %>%
mutate(Target_Sampled = case_when( #lakes sampled
endsWith(size_category, "(1,4]") ~ (7/62*100),
endsWith(size_category, "(4,10]") ~ (6/62*100),
endsWith(size_category, "(10,20]") ~ (4/62*100),
endsWith(size_category, "(20,50]") ~ (17/62*100),
endsWith(size_category, ">50") ~ (28/62*100)
)) %>%
mutate(Evaluated_Lakes = case_when( #lakes evaluated in design frame
endsWith(size_category, "(1,4]") ~ (29/116*100),
endsWith(size_category, "(4,10]") ~ (15/116*100),
endsWith(size_category, "(10,20]") ~ (18/116*100),
endsWith(size_category, "(20,50]") ~ (21/116*100),
endsWith(size_category, ">50") ~ (33/116*100)
)) %>%
mutate(Design_Frame = case_when( #lakes in design frame
endsWith(size_category, "(1,4]") ~ (102/500*100),
endsWith(size_category, "(4,10]") ~ (87/500*100),
endsWith(size_category, "(10,20]") ~ (98/500*100),
endsWith(size_category, "(20,50]") ~ (105/500*100),
endsWith(size_category, ">50") ~ (108/500*100)
)) %>%
rename("Statewide estimate"=Probability_Sites,
"LMAS"=LMAS_Sites,
"Sampled lakes"=Target_Sampled,
"Evaluated lakes"=Evaluated_Lakes,
"Design frame"=Design_Frame) %>%
ungroup() %>%
select(-PROB_CAT) %>%
gather(study,percentage,-size_category) %>%
arrange(study,size_category) %>%
mutate(percentage=as.numeric(percentage),
size_category=factor(size_category,levels=c("(1,4]","(4,10]","(10,20]","(20,50]",">50")))
frame$study <- factor(frame$study, levels=c("Statewide estimate","LMAS","Sampled lakes","Evaluated lakes","Design frame"))
ggplot(frame, aes(x=study,y=percentage)) +
geom_col_pattern(stat = "identity", position = "stack",
aes(pattern_type = size_category,
pattern_fill = size_category),
pattern= 'magick',
fill= 'white',
colour= 'black') +
ylim(0,100)+
scale_pattern_type_discrete(choices = gridpattern::names_magick) +
scale_pattern_manual(guide = guide_legend(override.aes = list(pattern_scale = 0.5)))+
labs(title="Size classes in the Probability Study versus LMAS samples since 2010",y="Percent of Total",x="Size Category (hectares)")+
theme(legend.key.height = unit(2, 'cm'),
legend.key.width = unit(2, 'cm'),
axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
forplot<-probability.cat %>%
mutate(study="probability") %>%
add_row(Indicator="Chlorophyll-a",
Category="Eutrophic",
study="LMAS",
Estimate.P=((215/473)*100)) %>%
add_row(Indicator="Chlorophyll-a",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((182/473)*100)) %>%
add_row(Indicator="Chlorophyll-a",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((76/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Eutrophic",
study="LMAS",
Estimate.P=((211/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((141/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((121/473)*100)) %>%
add_row(Indicator="Secchi",
Category="Eutrophic",
study="LMAS",
Estimate.P=((371/700)*100)) %>%
add_row(Indicator="Secchi",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((279/700)*100)) %>%
add_row(Indicator="Secchi",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((50/700)*100)) %>%
mutate(LCB95Pct.P=ifelse(is.na(LCB95Pct.P),Estimate.P,LCB95Pct.P),
UCB95Pct.P=ifelse(is.na(UCB95Pct.P),Estimate.P,UCB95Pct.P))
plot <- ggplot(forplot, aes(x=Category,fill=study)) +
geom_bar_pattern(stat = "identity", position = position_dodge(),
aes(y=Estimate.P,
pattern_type = study,
pattern_fill = study),
pattern= 'magick',
fill= 'white',
colour= 'black',
pattern_scale = 0.5,
pattern_key_scale_factor = 1.1)+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
position=position_dodge(.9))+
facet_wrap(~Indicator,scales = "free")+
ylim(0,100)+
labs(title="Trophic Classes across NYS versus LMAS samples since 2010",y="Percent of Total",x="Trophic Classes")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
legend.key.height = unit(1, 'cm'),
legend.key.width = unit(1, 'cm'))+
scale_pattern_type_discrete(choices = c("gray15","bricks"))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
sessionInfo(package=NULL)
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] maps_3.4.0 ggmap_3.0.0 spsurvey_5.0.1 survey_4.1-1
## [5] survival_3.2-13 Matrix_1.3-4 sf_1.0-3 ggpattern_0.4.2
## [9] egg_0.4.5 gridExtra_2.3 lubridate_1.8.0 huxtable_5.4.0
## [13] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [17] readr_2.0.2 tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5
## [21] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-153 bitops_1.0-7 fs_1.5.0
## [4] httr_1.4.2 tools_4.1.2 backports_1.3.0
## [7] bslib_0.3.1 utf8_1.2.2 R6_2.5.1
## [10] KernSmooth_2.23-20 AlgDesign_1.2.0 DBI_1.1.1
## [13] colorspace_2.0-2 sp_1.4-5 withr_2.4.2
## [16] tidyselect_1.1.1 compiler_4.1.2 cli_3.1.0
## [19] rvest_1.0.2 xml2_1.3.2 labeling_0.4.2
## [22] sass_0.4.0 scales_1.1.1 classInt_0.4-3
## [25] proxy_0.4-26 commonmark_1.7 crossdes_1.1-1
## [28] digest_0.6.28 minqa_1.2.4 rmarkdown_2.11
## [31] jpeg_0.1-9 pkgconfig_2.0.3 htmltools_0.5.2
## [34] lme4_1.1-27.1 highr_0.9 dbplyr_2.1.1
## [37] fastmap_1.1.0 rlang_0.4.12 readxl_1.3.1
## [40] rstudioapi_0.13 farver_2.1.0 jquerylib_0.1.4
## [43] generics_0.1.1 jsonlite_1.7.2 gtools_3.9.2
## [46] magrittr_2.0.1 Rcpp_1.0.7 munsell_0.5.0
## [49] fansi_0.5.0 lifecycle_1.0.1 stringi_1.7.5
## [52] yaml_2.2.1 MASS_7.3-54 plyr_1.8.6
## [55] crayon_1.4.2 deldir_1.0-6 lattice_0.20-45
## [58] haven_2.4.3 splines_4.1.2 hms_1.1.1
## [61] magick_2.7.3 knitr_1.36 pillar_1.6.4
## [64] rjson_0.2.20 boot_1.3-28 reprex_2.0.1
## [67] glue_1.5.0 evaluate_0.14 mitools_2.4
## [70] modelr_0.1.8 png_0.1-7 vctrs_0.3.8
## [73] nloptr_1.2.2.3 tzdb_0.2.0 RgoogleMaps_1.4.5.3
## [76] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
## [79] cachem_1.0.6 xfun_0.28 broom_0.7.10
## [82] e1071_1.7-9 class_7.3-19 memoise_2.0.0
## [85] gridpattern_0.5.1 units_0.7-2 ellipsis_0.3.2
end.time <- Sys.time()
elapsed.time <- round((end.time - start.time), 3)
print(elapsed.time)
## Time difference of 12.562 mins